#%pip install catboost
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.calibration import calibration_curve
import plotly.express as px
# Reading the file
# Load the raw take-home dataset. Per the preview below, the count/amount
# columns arrive as comma-formatted strings and '-' marks a missing count,
# so they land as dtype object and are cleaned to numeric in a later cell.
df = pd.read_csv('TakeHomeData.csv')
df.head()
| year | client | months_as_client_in_2023 | client_start_date | client_budget | client_spend | impressions_received | clicks_received | calls_received | emails_received | web_events_received | leads_received | client_state | bc_name | churn_indicator | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023 | 1 | 12 | 1/5/2023 | 48,406.80 | 47,210.69 | 253,746,288 | 5,792,808 | 83,868 | 348 | 10,092 | 94,308 | FL | Dentists & Dental Services | 0 |
| 1 | 2023 | 2 | 12 | 1/5/2023 | 1,400.00 | 1,473.72 | 5,545,008 | 229,936 | 4,788 | - | 4,032 | 8,820 | NY | Attorneys & Legal Services | 0 |
| 2 | 2023 | 3 | 12 | 1/23/2023 | 400,896.00 | 321,417.48 | 479,778,784 | 19,068,544 | 2,007,264 | 12,064 | 470,496 | 2,489,824 | NC | Home & Home Improvement | 0 |
| 3 | 2023 | 4 | 12 | 1/1/2023 | 290,286.40 | 231,168.99 | 303,924,144 | 11,370,874 | 863,078 | 6,352 | 185,002 | 1,054,432 | TX | Home & Home Improvement | 0 |
| 4 | 2023 | 5 | 12 | 1/11/2023 | 13,070.75 | 12,559.39 | 16,054,082 | 1,330,064 | 158,110 | - | 38,024 | 196,134 | FL | Automotive -- Repair, Service & Parts | 0 |
# Feature Types
# Confirms the thousands-separator columns are object dtype at this point.
df.dtypes
year int64 client int64 months_as_client_in_2023 int64 client_start_date object client_budget object client_spend object impressions_received object clicks_received object calls_received object emails_received object web_events_received object leads_received object client_state object bc_name object churn_indicator int64 dtype: object
# Dataset Size
# 8,000 client rows x 15 columns, all for year 2023.
df.shape
(8000, 15)
# Numeric Conversion
# Strip the thousands separators from every count/amount column and coerce
# to numeric in one vectorized pass; '-' placeholders (and anything else
# unparseable) become NaN via errors='coerce'.
columns_to_convert = [
    'client_budget', 'client_spend', 'impressions_received',
    'clicks_received', 'calls_received', 'emails_received',
    'web_events_received', 'leads_received'
]
df[columns_to_convert] = df[columns_to_convert].apply(
    lambda series: pd.to_numeric(series.str.replace(',', ''), errors='coerce')
)
# Feature Types
# Post-conversion check: the formatted string columns are now float64/int64.
df.dtypes
year int64 client int64 months_as_client_in_2023 int64 client_start_date object client_budget float64 client_spend float64 impressions_received int64 clicks_received int64 calls_received float64 emails_received float64 web_events_received float64 leads_received float64 client_state object bc_name object churn_indicator int64 dtype: object
# Summary statistics — note the heavy right skew (max values several orders
# of magnitude above the 75th percentile) and count < 8000 where NaNs exist.
df.describe()
| year | client | months_as_client_in_2023 | client_budget | client_spend | impressions_received | clicks_received | calls_received | emails_received | web_events_received | leads_received | churn_indicator | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8000.0 | 8000.00000 | 8000.000000 | 8.000000e+03 | 8.000000e+03 | 8.000000e+03 | 8.000000e+03 | 7.647000e+03 | 3.122000e+03 | 7.646000e+03 | 7.854000e+03 | 8000.000000 |
| mean | 2023.0 | 4000.50000 | 9.789875 | 2.279044e+04 | 2.010503e+04 | 5.567114e+07 | 4.046339e+06 | 2.127724e+05 | 5.849077e+03 | 2.311937e+05 | 4.345605e+05 | 0.407125 |
| std | 0.0 | 2309.54541 | 3.321950 | 3.150488e+05 | 2.641288e+05 | 9.897457e+08 | 8.419519e+07 | 5.064632e+06 | 2.888547e+04 | 4.161257e+06 | 8.793591e+06 | 0.491329 |
| min | 2023.0 | 1.00000 | 1.000000 | 1.200000e+00 | 6.600000e-01 | 1.488000e+03 | 7.700000e+01 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 0.000000 |
| 25% | 2023.0 | 2000.75000 | 8.000000 | 8.994000e+02 | 8.093425e+02 | 1.254452e+06 | 8.008100e+04 | 3.192000e+03 | 1.450000e+02 | 1.061000e+03 | 5.460250e+03 | 0.000000 |
| 50% | 2023.0 | 4000.50000 | 12.000000 | 2.630250e+03 | 2.449515e+03 | 4.585024e+06 | 2.890400e+05 | 1.311500e+04 | 5.510000e+02 | 4.474000e+03 | 2.125100e+04 | 0.000000 |
| 75% | 2023.0 | 6000.25000 | 12.000000 | 7.165650e+03 | 6.634185e+03 | 1.360352e+07 | 8.667595e+05 | 4.561800e+04 | 2.781000e+03 | 1.883225e+04 | 7.748875e+04 | 1.000000 |
| max | 2023.0 | 8000.00000 | 12.000000 | 2.351699e+07 | 1.928980e+07 | 6.094081e+10 | 5.035279e+09 | 3.739859e+08 | 1.045928e+06 | 3.215831e+08 | 6.956959e+08 | 1.000000 |
# Missing Values
# Absolute missing counts per column (includes the '-' values coerced above).
df.isnull().sum()
year 0 client 0 months_as_client_in_2023 0 client_start_date 0 client_budget 0 client_spend 0 impressions_received 0 clicks_received 0 calls_received 353 emails_received 4878 web_events_received 354 leads_received 146 client_state 6 bc_name 22 churn_indicator 0 dtype: int64
# Percentage of Missing Values
# emails_received is ~61% missing (dropped below); the other engagement
# columns are under 5% missing, so row-wise dropping is acceptable.
df.isnull().mean()*100
year 0.0000 client 0.0000 months_as_client_in_2023 0.0000 client_start_date 0.0000 client_budget 0.0000 client_spend 0.0000 impressions_received 0.0000 clicks_received 0.0000 calls_received 4.4125 emails_received 60.9750 web_events_received 4.4250 leads_received 1.8250 client_state 0.0750 bc_name 0.2750 churn_indicator 0.0000 dtype: float64
# Missing-value handling:
#  - drop the small share of rows (<5% per column) missing engagement counts;
#  - fill the handful of missing categoricals with an 'Unknown' sentinel;
#  - drop emails_received outright (~61% missing — too sparse to impute).
df.dropna(subset=['calls_received', 'web_events_received', 'leads_received'], inplace=True)
for categorical_column in ('client_state', 'bc_name'):
    df[categorical_column] = df[categorical_column].fillna('Unknown')
df.drop(columns=['emails_received'], inplace=True)
# Sanity check: no missing values remain after the cleaning steps above.
df.isnull().mean()*100
year 0.0 client 0.0 months_as_client_in_2023 0.0 client_start_date 0.0 client_budget 0.0 client_spend 0.0 impressions_received 0.0 clicks_received 0.0 calls_received 0.0 web_events_received 0.0 leads_received 0.0 client_state 0.0 bc_name 0.0 churn_indicator 0.0 dtype: float64
# Churn by Campaign Performance: Do higher impressions, clicks, or leads protect against churn?
metrics = ["impressions_received", "clicks_received", "leads_received"]
# Clip each metric at its 95th percentile in the bottom row so the boxes
# are readable despite the heavy right skew seen in df.describe().
cut_percentile = 95
plt.figure(figsize=(15, 10))
# Original Boxplots
for i, metric in enumerate(metrics, 1):
    plt.subplot(2, 3, i)  # top row
    # NOTE(review): passing palette without hue is deprecated in recent
    # seaborn — emits a FutureWarning but still colors by x here.
    sns.boxplot(data=df, x='churn_indicator', y=metric, palette='Set2')
    plt.title(f'{metric.replace("_", " ").capitalize()} by Churn')
    plt.xlabel('Churn Indicator')
    plt.ylabel(metric.replace('_', ' ').capitalize())
# Modified to remove outliers
for i, metric in enumerate(metrics, 1):
    plt.subplot(2, 3, i + 3)  # bottom row, aligned under the same metric
    cutoff_value = df[metric].quantile(cut_percentile / 100.0)
    filtered_df = df[df[metric] <= cutoff_value]
    sns.boxplot(data=filtered_df, x='churn_indicator', y=metric, palette='Set2')
    plt.title(f'{metric.replace("_", " ").capitalize()} by Churn (Without Outliers)')
    plt.xlabel('Churn Indicator')
    plt.ylabel(metric.replace('_', ' ').capitalize())
    # mean for each group
    # Red dots mark group means; groupby yields churn groups in sorted
    # order (0 then 1), matching the boxplot x positions.
    group_means_filtered = filtered_df.groupby('churn_indicator')[metric].mean()
    for j, mean_val in enumerate(group_means_filtered):
        # NOTE(review): the 'Mean' label is set once but plt.legend() is
        # never called for this figure, so it is currently unused.
        plt.scatter(j, mean_val, color='red', s=100, label='Mean' if i == 1 and j == 0 else "")
plt.tight_layout()
plt.show()
# the bulk of the data is concentrated at lower values for both groups
# Higher impressions, clicks, or leads alone might not strongly protect against churn
# Budget vs. Spend Efficiency by Churn: Are churned clients less efficient in using their budget?
# Ratio of actual spend to allocated budget; >1 means overspend. Division is
# safe here: per df.describe(), min client_budget is 1.2 (never zero).
df['spend_efficiency'] = df['client_spend'] / df['client_budget']
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='churn_indicator', y='spend_efficiency', palette='coolwarm')
# Mean for each group
# groupby yields churn groups sorted (0 then 1), matching boxplot positions.
group_means = df.groupby('churn_indicator')['spend_efficiency'].mean()
for i, mean_val in enumerate(group_means):
    plt.scatter(i, mean_val, color='red', s=100, label='Mean' if i == 0 else "")
plt.title('Spend Efficiency by Churn Indicator')
plt.xlabel('Churn Indicator')
plt.ylabel('Spend Efficiency (Spend / Budget)')
plt.legend(title="Annotations")
plt.show()
# churned clients seem to have a wider spread in their spend efficiency,
# with a larger number of outliers at both lower and higher efficiency levels
# indicate inconsistencies in budget management
# State-wise Churn Rates: Do some states exhibit consistently higher churn rates?
# Churn Rate by State
# Mean of the 0/1 churn flag per state is exactly the per-state churn rate;
# sorting ascending makes the worst states stand out on the right.
churn_rates = df.groupby('client_state')['churn_indicator'].mean().sort_values()
fig = px.bar(
    x=churn_rates.index,
    y=churn_rates.values,
    text=churn_rates.values,
    labels={'x': 'Client State', 'y': 'Churn Rate'},
    title="Churn Rates by Client State",
)
# Better Appearance
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')
fig.update_layout(
    width=900,
    height=600,
    xaxis_title="Client State",
    yaxis_title="Churn Rate",
    xaxis=dict(tickangle=90),
)
fig.show()
# Campaign Performance and Lead Conversion: Does better lead conversion reduce churn risk?
# Leads generated per impression; impressions_received has min 1,488 per
# df.describe(), so the division is safe.
df['conversion_rate'] = df['leads_received'] / df['impressions_received']
# Outlier Cutoff
# NOTE(review): this rebinds filtered_df from the earlier boxplot cell —
# harmless here since it is re-derived, but easy to confuse when re-running
# cells out of order.
cutoff_v = df['conversion_rate'].quantile(0.95)
filtered_df = df[df['conversion_rate'] <= cutoff_v]
plt.figure(figsize=(16, 6))
# Original
plt.subplot(1, 2, 1)
sns.boxplot(data=df, x='churn_indicator', y='conversion_rate', palette='Set3')
# groupby yields churn groups sorted (0 then 1), matching boxplot positions.
mean_original = df.groupby('churn_indicator')['conversion_rate'].mean()
for i, mean_val in enumerate(mean_original):
    plt.scatter(i, mean_val, color='red', s=100, label='Mean' if i == 0 else "")
plt.title('Conversion Rate by Churn Indicator (Original)')
plt.xlabel('Churn Indicator')
plt.ylabel('Conversion Rate (Leads / Impressions)')
plt.legend(title="Annotations", loc='upper right')
# Without Outliers
plt.subplot(1, 2, 2)
sns.boxplot(data=filtered_df, x='churn_indicator', y='conversion_rate', palette='Set3')
mean_filtered = filtered_df.groupby('churn_indicator')['conversion_rate'].mean()
for i, mean_val in enumerate(mean_filtered):
    plt.scatter(i, mean_val, color='red', s=100, label='Mean' if i == 0 else "")
plt.title('Conversion Rate by Churn Indicator (Without Outliers)')
plt.xlabel('Churn Indicator')
plt.ylabel('Conversion Rate (Leads / Impressions)')
plt.legend(title="Annotations", loc='upper right')
plt.tight_layout()
plt.show()
# similar in its overall distribution
# low median conversion rate for both groups
# Churn Indicator Distribution
# Fix: value_counts() orders classes by FREQUENCY, not by class value, so the
# previously hard-coded labels ['Not Churned (0)', 'Churned (1)'] would
# silently mislabel both slices if churners ever became the majority class.
# Deriving the labels from the actual index keeps them correct either way.
churn_distribution = df['churn_indicator'].value_counts(normalize=True)
label_map = {0: 'Not Churned (0)', 1: 'Churned (1)'}
labels = [label_map[flag] for flag in churn_distribution.index]
sizes = churn_distribution.values
plt.figure(figsize=(8, 8))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 14})
plt.title("Churn Indicator Distribution", fontsize=16)
plt.axis('equal')  # Circular
plt.show()
# Engagement to Impression Ratio
# Composite engagement rate: total engagement events per impression served.
# Fix: the original column name carried a trailing space ('eng_to_imp_ratio '),
# which makes the feature easy to mis-reference or accidentally duplicate.
df['eng_to_imp_ratio'] = (
    df['clicks_received']
    + df['calls_received']
    + df['web_events_received']
    + df['leads_received']
) / df['impressions_received']
# Define features and target
# Drop identifiers and constants (year is always 2023, client is an ID) plus
# the raw start date; everything else — including the engineered ratio
# columns — feeds the model.
X = df.drop(columns=['year','months_as_client_in_2023','client','churn_indicator', 'client_start_date'])
y = df['churn_indicator']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    cat_features=['client_state', 'bc_name'],  # Categorical features
    verbose=100
)
# Train
# NOTE(review): the test split is used as the eval_set for early stopping,
# so the reported test AUC is mildly optimistic (model selection on the test
# set). A separate validation split would give an unbiased estimate.
model.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=50)
0: test: 0.7888070 best: 0.7888070 (0) total: 82.1ms remaining: 41s 100: test: 0.8023052 best: 0.8023052 (100) total: 424ms remaining: 1.68s 200: test: 0.8075692 best: 0.8075992 (199) total: 762ms remaining: 1.13s 300: test: 0.8110885 best: 0.8110885 (300) total: 1.12s remaining: 741ms 400: test: 0.8135437 best: 0.8136095 (399) total: 1.54s remaining: 381ms 499: test: 0.8155854 best: 0.8155854 (499) total: 1.9s remaining: 0us bestTest = 0.8155853971 bestIteration = 499
<catboost.core.CatBoostClassifier at 0x1479d6f50>
# Hard class predictions and positive-class (churn) probabilities on the
# held-out split.
y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Receiver Operating Characteristic - Area Under the Curve: Handles Imbalance Dataset
Accuracy: 0.7583892617449665
ROC AUC: 0.8155853970796416
Classification Report:
precision recall f1-score support
0 0.76 0.87 0.81 897
1 0.75 0.60 0.66 593
accuracy 0.76 1490
macro avg 0.76 0.73 0.74 1490
weighted avg 0.76 0.76 0.75 1490
def p_roc_curve(y_test, y_proba):
    """Plot the ROC curve for predicted churn probabilities.

    Args:
        y_test: array-like of true 0/1 labels.
        y_proba: array-like of predicted positive-class probabilities.
    """
    auc_value = roc_auc_score(y_test, y_proba)
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label=f'ROC Curve (AUC = {auc_value:.2f})')
    # Diagonal reference line: the performance of a random classifier.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.grid(True)
    plt.show()
def p_confusion_matrix(y_test, y_pred):
    """Render the 2x2 confusion matrix for hard churn predictions.

    Args:
        y_test: array-like of true 0/1 labels.
        y_pred: array-like of predicted 0/1 labels.
    """
    matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=['Not Churned', 'Churned'],
    )
    display.plot(cmap='Blues', values_format='d')
    plt.title("Confusion Matrix")
    plt.grid(False)
    plt.show()
# Render the evaluation plots for the held-out split.
p_roc_curve(y_test, y_proba)
p_confusion_matrix(y_test, y_pred)
# ROC Curve vs AUC?
# Higher Precision or Higher Recall for Churn Prediction
# Rank features by CatBoost importance and chart the ten strongest drivers.
# (Built as a single DataFrame rather than rebinding an intermediate array.)
feature_imp = pd.DataFrame({
    'Feature': x_train.columns,
    'Importance': model.get_feature_importance(),
})
top_features = feature_imp.nlargest(10, 'Importance')
# top 10 features
plt.figure(figsize=(8, 6))
plt.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
plt.xlabel('Importance Score', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.title('Top 10 Feature Importances', fontsize=16)
plt.gca().invert_yaxis()  # Flip so the most important feature sits on top
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
# Calibration Plot:
def calibration_plot(y_test, y_proba, n_bins=10):
    """Plot predicted probability against observed churn frequency per bin.

    A well-calibrated model tracks the diagonal: among clients assigned a
    predicted churn probability near p, roughly a fraction p actually churn.

    Args:
        y_test: array-like of true 0/1 labels.
        y_proba: array-like of predicted positive-class probabilities.
        n_bins: number of equal-width probability bins.
    """
    # calibration_curve returns (fraction of positives, mean predicted prob).
    frac_positive, mean_predicted = calibration_curve(
        y_test, y_proba, n_bins=n_bins, strategy='uniform'
    )
    plt.figure(figsize=(8, 6))
    plt.plot(mean_predicted, frac_positive, marker='o', label='Model')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect Calibration')
    plt.title('Calibration Plot', fontsize=16)
    plt.xlabel('Mean Predicted Probability', fontsize=14)
    plt.ylabel('Fraction of Positives', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()
# Check how well the predicted churn probabilities match observed churn rates.
calibration_plot(y_test, y_proba)
# helps in fine-tuning (e.g. choosing a decision threshold for interventions)